Preparation

# Install any package that is attached further down but is not yet available.
# requireNamespace() only checks availability; unlike require() it does not
# attach the package as a side effect, so loading is left to the library()
# calls that follow.
for (pkg in c("DT", "ggplot2", "tidyverse", "hrbrthemes", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load stuff we need later
library(readr)
library(DT)
library(ggplot2)
library(tidyverse)
library(hrbrthemes)
library(dplyr)
library(scales)

# and set the working directory
# NOTE(review): hard-coded, machine-specific path; the relative path passed to
# read_csv() further down is resolved against this directory.
setwd("~/projects/bbs-for-independence/03_workspace")

Import Data

# Load the dataset summary from csv and append charratioDelta, the difference
# between the two character ratios (charratioB minus charratioA).
dataset <- read_csv("./models/dataset.csv", show_col_types = FALSE) %>%
  mutate(charratioDelta = charratioB - charratioA)

Prepare Data

# Per-category mean (rounded to 2 digits) and row count n for length,
# length_raw, avgcolumnsize, charratioA, charratioB and charratioDelta.
# The columns are selected by name rather than by position so the summary does
# not silently pick up other columns if the column order of the csv changes.
summary_columns <- c("length", "length_raw", "avgcolumnsize",
                     "charratioA", "charratioB", "charratioDelta")
df <- aggregate(x = dataset[, summary_columns],
                by = list(dataset$category), # unnamed list -> grouping column is called "Group.1"
                FUN = function(x) list(
                  # values that cannot be converted to numbers become NA and are ignored by mean()
                  mean = round(mean(suppressWarnings(as.numeric(as.character(x))), na.rm = TRUE), digits = 2),
                  n = length(x)))
df <- do.call(data.frame, df) # bind columns which contain matrices back into the data frame
df <- as.data.frame(lapply(df, unlist)) # convert lists back to vectors

# Three working subsets: all files outside the excluded categories, plus the
# "magazines" and "digest" categories on their own.
f_selection <- filter(
  dataset,
  !(category %in% c("fidonet-on-the-internet", "tap", "floppies",
                    "exhibits", "artifacts", "piracy", "art",
                    "magazines", "digest"))
)
f_magazins <- filter(dataset, category == "magazines")
f_digest <- filter(dataset, category == "digest")

# For each threshold in x, count the files of the respective subset whose
# charratioB lies strictly above that threshold (rows with NA are not counted).
fun_charratioB_selection <- function(x) {
  vapply(x, function(threshold) sum(f_selection$charratioB > threshold, na.rm = TRUE), integer(1))
}
fun_charratioB_magazins <- function(x) {
  vapply(x, function(threshold) sum(f_magazins$charratioB > threshold, na.rm = TRUE), integer(1))
}
fun_charratioB_digest <- function(x) {
  vapply(x, function(threshold) sum(f_digest$charratioB > threshold, na.rm = TRUE), integer(1))
}

# Long-format table of the three curves, evaluated on thresholds 0, 0.01, ..., 1.
# x is recycled across the three stacked count vectors.
data_fun <- local({
  thresholds <- seq(0, 1, 0.01)
  data.frame(
    x = thresholds,
    n = c(
      fun_charratioB_selection(thresholds),
      fun_charratioB_magazins(thresholds),
      fun_charratioB_digest(thresholds)
    ),
    categories = rep(c("selection", "magazins", "digest"), each = length(thresholds))
  )
})

Summarize Data

# Report the dataset size, the number of categories and the effect of the two
# filters. The third message lists every category that f_selection actually
# excludes (the earlier wording left out fidonet-on-the-internet and artifacts).
cat("Anzahl Dateien: ", nrow(dataset))
## Anzahl Dateien:  105470
cat("Anzahl Kategorien: ", nrow(df))
## Anzahl Kategorien:  48
cat("Anzahl Dateien bei Filterung von fidonet-on-the-internet, tap, floppies, exhibits, artifacts, piracy, art, magazines, digest:", nrow(f_selection))
## Anzahl Dateien bei Filterung von fidonet-on-the-internet, tap, floppies, exhibits, artifacts, piracy, art, magazines, digest: 44833
cat("Anzahl Dateien bei zusätzlicher Filterung von charratioB >0.95:", fun_charratioB_selection(0.95))
## Anzahl Dateien bei zusätzlicher Filterung von charratioB >0.95: 7242
# Line chart of the number of files above each charratioB threshold, one line
# per subset, restricted to thresholds 0.8-1 with a log10 y axis.
ggplot(data_fun, aes(x = x, y = n, colour = categories)) +
  geom_line() +
  xlim(0.8, 1) +
  scale_y_log10() +
  labs(title = "n files if charratioB > x on a log10 scale")

# Interactive table of the per-category summary, sorted by descending mean
# charratioB, 50 rows per page; the JS callback shrinks the header font and
# adjusts the header cell padding once the table is initialised.
df %>%
  arrange(desc(charratioB.mean)) %>%
  select(
    Group.1, length.n, length.mean, length_raw.mean, avgcolumnsize.mean,
    charratioA.mean, charratioB.mean, charratioDelta.mean
  ) %>%
  datatable(
    options = list(
      pageLength = 50,
      initComplete = JS("function(settings, json) {
                      $(this.api().table().header()).css({'font-size' : '12px'});
                      $('table.dataTable thead th').css({'padding' : '10px 18px 10px 0px'});
                      }")
    )
  )

Plots

Verhältnis Text (exkl. Satz- und Leerzeichen) zu Dateilänge

# Box plot of charratioA per category, with the categories ordered by their
# median charratioA and the axes flipped so category names read horizontally.
dataset %>%
  ggplot(aes(x = reorder(category, charratioA, FUN = median),
             y = charratioA, group = category)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # faint jittered points show the individual files behind each box
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # red cross marks the category mean; `fun` replaces the `fun.y` argument
  # that is deprecated since ggplot2 3.3.0
  stat_summary(fun = mean, geom = "point", shape = 4, size = 2, color = "red", fill = "red") +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")

Verhältnis Text (inkl. Satz- und Leerzeichen) zu Dateilänge

# create plot: charratioB
# Box plot of charratioB per category, with the categories ordered by their
# median charratioB and the axes flipped so category names read horizontally.
dataset %>%
  ggplot(aes(x = reorder(category, charratioB, FUN = median),
             y = charratioB, group = category)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # faint jittered points show the individual files behind each box
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # red cross marks the category mean; `fun` replaces the `fun.y` argument
  # that is deprecated since ggplot2 3.3.0
  stat_summary(fun = mean, geom = "point", shape = 4, size = 2, color = "red", fill = "red") +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")

Differenz der beiden Verhältnisse (inkl. minus exkl. Satz- und Leerzeichen zu Dateilänge)

# Box plot of the difference charratioB - charratioA (stored in
# charratioDelta) per category. The categories are ordered by the median of
# the same quantity that is plotted, so the ordering direction matches the
# other box plots; previously the ordering used the negated difference.
dataset %>%
  ggplot(aes(x = reorder(category, charratioDelta, FUN = median),
             y = charratioDelta, group = category)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # faint jittered points show the individual files behind each box
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # red cross marks the category mean; `fun` replaces the `fun.y` argument
  # that is deprecated since ggplot2 3.3.0
  stat_summary(fun = mean, geom = "point", shape = 4, size = 2, color = "red", fill = "red") +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Differenz beiden Verhältnissen")

Apply filtering and extend filtering

# Categories that are left out of the filtered dataset
data_names_exclude <- c(
  "fidonet-on-the-internet", "tap", "floppies", "exhibits", "artifacts",
  "piracy", "art", "magazines", "digest"
)

# Keep files outside the excluded categories whose charratioB exceeds 0.95
dataset_filtered <- filter(
  dataset,
  !(category %in% data_names_exclude),
  charratioB > 0.95
)

# Count the files in dataset_filtered whose length is strictly below (lt) or
# strictly above (gt) each threshold in x.
fun_length_selection_lt <- Vectorize( function(x) { nrow(dataset_filtered %>% filter(length < x)) } )
fun_length_selection_gt <- Vectorize( function(x) { nrow(dataset_filtered %>% filter(length > x)) } )

# Thresholds from the smallest to the largest length in steps of 1000
length_fun_seq <- seq(min(dataset_filtered$length), max(dataset_filtered$length), 1000)
length_fun <- data.frame(x = length_fun_seq,
                         n = c(fun_length_selection_lt(length_fun_seq),
                               fun_length_selection_gt(length_fun_seq)),
                         # label order must match the order of the counts in `n`
                         # (lt first, then gt); the labels were previously swapped
                         categories = rep(c("less than", "greater than"), each = length(length_fun_seq)))

# Line chart of the file counts below / above each length threshold, with a
# log10 x axis.
ggplot(length_fun, aes(x = x, y = n, colour = categories)) +
  geom_line() +
  scale_x_log10() +
  labs(title = "n files if length < or > x on a log10 scale")

# Same selection as dataset_filtered, additionally restricted to files whose
# length lies strictly between 1000 and 10000.
dataset_filtered_2 <- filter(
  dataset,
  !(category %in% data_names_exclude),
  charratioB > 0.95,
  length > 1000,
  length < 10000
)

cat("Anzahl Dateien mit gefilterter Länge: ", nrow(dataset_filtered_2))
## Anzahl Dateien mit gefilterter Länge:  2741